rm(list=ls(all=TRUE)) # clear workspace
graphics.off() # closes all graphics
Thanks due to https://huntsmancancerinstitute.github.io/hciR/pasilla_DESeq.html
This guide follows the Bioconductor RNA-Seq workflow to find differentially expressed genes using DESeq2. Load the hciR package to run the R code.
Load the counts and samples using the readr package.
packages(readr)
packages(microbiome)
samples <- read_tsv("C:/Users/Ryan/Google Drive/UC Davis/Publications/MFC Microbiome Paper - Experiment 15/metadata_frd_exp15.txt")
counts <- read_tsv("metagenome.tab", quote = "")
# # A tibble: 6 x 19
# OTU_ID `12-MS515F` `1-MS515F` `13-MS515F` `10-MS515F` `2-MS515F` `3-MS515F` `4-MS515F` `5-MS515F` `6-MS515F`
# <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 K01365 0 0 0 0 0 0 0 0 0
# 2 K01364 0 0 0 0 0 0 0 0 0
# 3 K01361 4 11 2 6 0 14 0 7 6
# 4 K01360 0 0 0 0 0 0 0 0 0
# 5 K01362 18555 19442 18887 18768 11216 18297 23687 17577 20485
# 6 K02249 0 0 0 2 0 1 0 0 0
# # ... with 9 more variables: `7-MS515F` <int>, `9-MS515F` <int>, `11-MS515F` <int>, `14-MS515F` <int>,
# # `8-MS515F` <int>, `17-MS515F` <int>, `15-MS515F` <int>, `18-MS515F` <int>, `16-MS515F` <int>
Remove features with zero counts and features with one or fewer reads in any sample.
packages(dplyr)
counts <- filter_counts(counts)
# Removed 1254 features with 0 reads
# Removed 127 features with <=1 maximum reads
head(counts)
# # A tibble: 6 x 19
# OTU_ID `12-MS515F` `1-MS515F` `13-MS515F` `10-MS515F` `2-MS515F` `3-MS515F` `4-MS515F` `5-MS515F` `6-MS515F`
# <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 K01361 4 11 2 6 0 14 0 7 6
# 2 K01362 18555 19442 18887 18768 11216 18297 23687 17577 20485
# 3 K02249 0 0 0 2 0 1 0 0 0
# 4 K05841 1 0 0 0 0 0 10 0 0
# 5 K05844 3836 272 2208 1166 1544 781 158 844 1119
# 6 K05845 4415 16593 9487 11056 7075 12118 17639 12320 13951
# # ... with 9 more variables: `7-MS515F` <int>, `9-MS515F` <int>, `11-MS515F` <int>, `14-MS515F` <int>,
# # `8-MS515F` <int>, `17-MS515F` <int>, `15-MS515F` <int>, `18-MS515F` <int>, `16-MS515F` <int>
Filter data for classes of interest
which.cols <- unlist(samples[which(samples$Sample == "Bristles"|samples$Sample == "Cathode"),"Label"])
counts <- as_tibble(cbind(counts$OTU_ID,counts[,which(colnames(counts) %in% which.cols)]))
names(counts)[names(counts) == 'counts$OTU_ID'] <- 'OTU_ID'
counts$OTU_ID <- as.character(counts$OTU_ID)
head(counts)
# # A tibble: 6 x 9
# OTU_ID `12-MS515F` `13-MS515F` `11-MS515F` `14-MS515F` `17-MS515F` `15-MS515F` `18-MS515F` `16-MS515F`
# <chr> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 K01361 4 2 5 5 11 13 40 15
# 2 K01362 18555 18887 25754 21057 27253 22671 32389 31287
# 3 K02249 0 0 0 0 2 2 2 1
# 4 K05841 1 0 0 0 3 12 9 68
# 5 K05844 3836 2208 6645 4544 9622 14822 14065 14567
# 6 K05845 4415 9487 4423 3739 10177 2021 3788 5100
samples <- samples[which(samples$Sample == "Bristles"|samples$Sample == "Cathode"),]
samples
# # A tibble: 8 x 6
# Label Name Sample Temperature Substrate Aeration
# <chr> <chr> <chr> <chr> <chr> <chr>
# 1 11-MS515F MFC 1 EXP 15 BRISTLES FRD Bristles 30C Sludge Anaerobic
# 2 12-MS515F MCF 2 EXP 15 BRISTLES FRD Bristles 30C Sludge Anaerobic
# 3 13-MS515F MFC 3 EXP 15 BRISTLES FRD Bristles 30C Sludge Anaerobic
# 4 14-MS515F MFR 4 EXP 15 BRISTLES FRD Bristles 30C Sludge Anaerobic
# 5 15-MS515F MFC 1 EXP 15 CATHODE FRD Cathode 30C Sludge Anaerobic
# 6 16-MS515F MFC 2 EXP 15 CATHODE FRD Cathode 30C Sludge Anaerobic
# 7 17-MS515F MFC 3 EXP 15 CATHODE FRD Cathode 30C Sludge Anaerobic
# 8 18-MS515F MFC 4 EXP 15 CATHODE FRD Cathode 30C Sludge Anaerobic
Combine the counts and samples to create a DESeqDataSet object and calculate the regularized log transform (rlog) for sample visualizations.
packages(lazyeval)
packages(DESeq2)
names(samples)[names(samples) == 'Sample'] <- 'condition'
dds <- deseq_from_tibble(counts, samples, design = ~ condition)
rld <- r_log(dds)
Plot the first two principal components using the rlog values from the top 500 variable genes. You can hover over points to view sample names or zoom into groups of points in this interactive highchart.
plot_pca(rld, "condition", tooltip=c("Label") , width=700)
Cluster all the rlog values using the R function dist to calculate the Euclidean distance between samples.
plot_dist(rld , c("condition"), palette="Reds", diagNA=FALSE, reverse_pal=FALSE)
Load/create annotations.
fly <- as_tibble(gene_name_lookup)
colnames(fly) <- c("id","gene_name")
fly <- fly[order(fly$id),]
#add some NA columns
namevector <- c("biotype", "chromosome", "description")
fly[ , namevector] <- NA
head(fly)
# # A tibble: 6 x 5
# id gene_name biotype chromosome
# <chr> <chr> <lgl> <lgl>
# 1 K00001 alcohol dehydrogenase [EC:1.1.1.1] NA NA
# 2 K00002 alcohol dehydrogenase (NADP+) [EC:1.1.1.2] NA NA
# 3 K00003 homoserine dehydrogenase [EC:1.1.1.3] NA NA
# 4 K00004 "\"(R,R)-butanediol dehydrogenase / diacetyl reductase [EC:1.1.1.4 1.1.1.303]\"" NA NA
# 5 K00005 glycerol dehydrogenase [EC:1.1.1.6] NA NA
# 6 K00007 D-arabinitol 4-dehydrogenase [EC:1.1.1.11] NA NA
# # ... with 1 more variables: description <lgl>
Get the annotated DESeq results using a 5% false discovery rate (FDR). The default is to compare all treatment combinations, which only includes one in this study.
Since we downloaded the most recent fly annotations, there are 723 missing (note you could set the version in read_biomart to match the old count table)
res <- results_all(dds, fly, alpha= 0.05)
# Using adjusted p-value < 0.05
# 1. Bristles vs. Cathode: 1595 up and 1298 down regulated
# Note: 5528 rows in results are missing from biomart table
Create a flex dashboard using the top 2000 genes sorted by p-value in pasilla_flex.html. The MA-plot and volcano plot are linked, so you can click and drag to create a box to highlight points and then view matching rows in the table. You can also drag the box around the plot to easily highlight other points and rows. In addition, you can search for genes in the table using the search box, but need to click on the row in order to highlight the points in the plots. The sliders can also be used to limit the results.
# packages(rmarkdown)
# packages(flexdashboard)
# packages(crosstalk)
# packages(d3scatter)
# rmd <- system.file("Rmd", "DESeq_flex.Rmd", package="hciR")
# render(rmd, output_file="pasilla_flex.html", output_dir=".",
# params=list( results= res, title = "Treatments", top= 2000 ))
Save the DESeq results to a single Excel file in pasilla_DESeq.xlsx. The write_deseq function will also output raw counts, rlog values, normalized counts, samples and fly annotations.
# write_deseq(res, dds, rld, fly, file="pasilla_DESeq.xlsx")
Plot the fold changes and p-values in an interactive volcano plot.
plot_volcano(res, padj = 1e-10, log2Fold =1 , width=700)
# Adding mouseover labels to 3256 genes (62.4%)
Select the top 40 genes sorted by p-value and cluster the rlog differences, so values in the heatmap represent the amount a gene deviates in a specific sample from the gene’s average across all samples.
x <- top_counts( res, rld, top=40)
x
# # A tibble: 40 x 9
# id `11-MS515F` `12-MS515F` `13-MS515F`
# <chr> <dbl> <dbl> <dbl>
# 1 hydroxylamine oxidase [EC:1.7.3.4] 12.04587 11.36003 11.26504
# 2 "\"1,3-propanediol dehydrogenase [EC:1.1.1.202]\"" 12.10677 11.45461 11.32907
# 3 nucleoside-triphosphatase THEP1 [EC:3.6.1.15] 11.08803 10.40032 10.34029
# 4 prephenate dehydratase [EC:4.2.1.51] 11.04784 10.34394 10.27744
# 5 thiosulfate reductase [EC:1.-.-.-] 11.11512 10.42496 10.36069
# 6 None 11.08552 10.41803 10.31882
# 7 glycerol-3-phosphate dehydrogenase subunit C [EC:1.1.5.3] 11.07905 10.41837 10.32025
# 8 "\"LysR family transcriptional regulator, positive regulator for ilvC\"" 11.03326 10.32951 10.25655
# 9 nitrogenase molybdenum-iron protein NifN 11.21569 10.60965 10.46374
# 10 putative transposase 11.26020 10.70259 10.48936
# # ... with 30 more rows, and 5 more variables: `14-MS515F` <dbl>, `15-MS515F` <dbl>, `16-MS515F` <dbl>,
# # `17-MS515F` <dbl>, `18-MS515F` <dbl>
plot_genes(x, c("condition") )
Plot the top 400 genes using an interactive d3heatmap. Click and drag over a region in the plot to zoom and better view gene labels.
plot_genes( top_counts( res, rld, top=400), output="d3", xaxis_font_size=12, show_grid=FALSE)